2.1. Incomplete dataset
load("/Volumes/LGBT Project data/Multiple Imputation/d_2021_incomplete.RData")
summary( d_2021_incomplete )
sampling_strata_region calibrated_weight no.of.population sexual_identity_2021 age sex
Täby : 685 Min. : 6.221 Min. : 7470 Heterosexual :20074 Min. : 16.00 Male :10648
Danderyd : 673 1st Qu.: 42.552 1st Qu.: 28286 Homosexual : 366 1st Qu.: 38.00 Female:12418
Skarpnäck : 665 Median : 69.147 Median : 46076 Bisexual : 584 Median : 53.00
Värmdö : 662 Mean : 79.226 Mean : 48535 None of the above: 1534 Mean : 52.19
Kungsholmen: 662 3rd Qu.:106.412 3rd Qu.: 63834 NA's : 508 3rd Qu.: 67.00
Södermalm : 662 Max. :370.732 Max. :106702 Max. :100.00
(Other) :19057
country_of_birth education income marital_status weight_strata
Sweden :18101 <=9 years : 2771 Min. : 0 Never married : 8130 20 : 962
Europe : 2410 10-12 years: 7948 1st Qu.: 2389 Currently married:10817 12 : 941
Outside Europe: 2555 >=13 years :11875 Median : 3324 Other : 4119 18 : 937
NA's : 472 Mean : 4115 4 : 936
3rd Qu.: 4478 2 : 934
Max. :497851 17 : 933
NA's :40 (Other):17423
sapply( d_2021_incomplete, class ) # all continuous variables are numeric, and all categorical variables are factor
sampling_strata_region calibrated_weight no.of.population sexual_identity_2021 age sex
"factor" "numeric" "numeric" "factor" "numeric" "factor"
country_of_birth education income marital_status weight_strata
"factor" "factor" "numeric" "factor" "factor"
miss_var_summary( d_2021_incomplete ) # 2.2% missing in sexual_identity_2021, 2.1% in education, and 0.2% in income
2.2. Two-level multivariate normal imputation
# specify imputation model
# fml_imp_2021 <- sexual_identity_2021 + education + income ~ 1 + age*sex + country_of_birth + marital_status + ( 1 | weight_strata )
# final imputation with the chosen number of iterations
# imp_final_2021 <- jomoImpute( data = d_2021_incomplete,
# formula = fml_imp_2021,
# random.L1 = "full",
# n.burn = 2000,
# n.iter = 1000,
# m = 20,
# seed = 12345
# ) # took around 1 hour and a half
load("/Volumes/LGBT Project data/Multiple Imputation/imp_final_2021.RData")
summary( imp_final_2021 )
Call:
jomoImpute(data = d_2021_incomplete, formula = fml_imp_2021,
random.L1 = "full", n.burn = 2000, n.iter = 1000, m = 20,
seed = 12345)
Cluster variable: weight_strata
Target variables: sexual_identity_2021 education income
Fixed effect predictors: (Intercept) age sex country_of_birth marital_status age:sex
Random effect predictors: (Intercept)
Performed 2000 burn-in iterations, and generated 20 imputed data sets,
each 1000 iterations apart.
Potential scale reduction (Rhat, imputation phase):
Min 25% Mean Median 75% Max
Beta: 1.000 1.001 1.006 1.002 1.009 1.033
Psi: 1.000 1.001 1.002 1.001 1.002 1.005
Sigma: 1.000 1.000 1.035 1.002 1.026 1.820
Largest potential scale reduction:
Beta: [1,6], Psi: [6,6], Sigma: [5,3]
Missing data per variable:
weight_strata sexual_identity_2021 education income sampling_strata_region calibrated_weight no.of.population age sex country_of_birth
MD% 0 2.2 2.0 0.2 0 0 0 0 0 0
marital_status
MD% 0
plot( imp_final_2021, trace = "all", print = "beta" )

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

2.3. Validate imputed data
# extract imputed datasets
original_data_2021 <- mitmlComplete( imp_final_2021, print = 0 ) # extract original incomplete dataset
implist_2021 <- mitmlComplete( imp_final_2021, print = "all" ) # extract all imputed datasets
original_data_2021$imputation <- "0"
all_data_2021 <- bind_rows( original_data_2021,
bind_rows( implist_2021, .id = "imputation" ) ) # merge datasets
all_data_2021$imputation <- as.numeric( all_data_2021$imputation )
summary( all_data_2021 )
weight_strata sexual_identity_2021 education income sampling_strata_region calibrated_weight
20 : 20202 Heterosexual :429863 <=9 years : 60506 Min. :-15246 Täby : 14385 Min. : 6.221
12 : 19761 Homosexual : 7869 10-12 years:169887 1st Qu.: 2389 Danderyd : 14133 1st Qu.: 42.552
18 : 19677 Bisexual : 12624 >=13 years :253521 Median : 3324 Skarpnäck : 13965 Median : 69.147
4 : 19656 None of the above: 33522 NA's : 472 Mean : 4114 Värmdö : 13902 Mean : 79.226
2 : 19614 NA's : 508 3rd Qu.: 4480 Kungsholmen: 13902 3rd Qu.:106.423
17 : 19593 Max. :497851 Södermalm : 13902 Max. :370.732
(Other):365883 NA's :40 (Other) :400197
no.of.population age sex country_of_birth marital_status imputation
Min. : 7470 Min. : 16.00 Male :223608 Sweden :380121 Never married :170730 Min. : 0
1st Qu.: 28286 1st Qu.: 38.00 Female:260778 Europe : 50610 Currently married:227157 1st Qu.: 5
Median : 46076 Median : 53.00 Outside Europe: 53655 Other : 86499 Median :10
Mean : 48535 Mean : 52.19 Mean :10
3rd Qu.: 63834 3rd Qu.: 67.00 3rd Qu.:15
Max. :106702 Max. :100.00 Max. :20
# sexual identity in 2021
ggplot( all_data_2021[ !is.na( all_data_2021$sexual_identity_2021 ), ],
aes( fill = sexual_identity_2021, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Sexual identity in 2021" ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# education
ggplot( all_data_2021[ !is.na( all_data_2021$education ), ],
aes( fill = education, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Level of education" ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# income
summary( all_data_2021$income )
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-15246 2389 3324 4114 4480 497851 40
nrow( all_data_2021[ all_data_2021$income < 0 & !is.na( all_data_2021$income ), ] ) # 194 imputed values are negative
[1] 194